pbmc.data <- Read10X(data.dir = "/home/nastasista/Desktop/RNAseq/sc_RNA/t")
pbmc <- CreateSeuratObject(counts = pbmc.data, project = "pbmc3k", min.cells = 3, min.features = 200)
pbmc
## An object of class Seurat 
## 19420 features across 7903 samples within 1 assay 
## Active assay: RNA (19420 features, 0 variable features)
##  2 layers present: counts, data
rm(pbmc.data) 
# QC
meta <- pbmc@meta.data
dim(meta)
## [1] 7903    3
head(meta)

Объект Seurat(PBMC) содержит 19420 генов и 7903 клетки (features)

summary(meta$nCount_RNA)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##     500    2617    3206    3715    4092   29348
summary(meta$nFeature_RNA)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##     200     953    1131    1279    1386    5601
pbmc[["percent.mt"]] <- PercentageFeatureSet(pbmc, pattern = "^MT-")
pbmc[["percent.rb"]] <- PercentageFeatureSet(pbmc, pattern = "^RP[SL]")
head(pbmc[[]])
VlnPlot(pbmc, features = c("nFeature_RNA","nCount_RNA","percent.mt","percent.rb"),ncol = 4,pt.size = 0.1) & 
  theme(plot.title = element_text(size=10))

plot1 <- FeatureScatter(pbmc, feature1 = "nCount_RNA", feature2 = "percent.mt")
plot2 <- FeatureScatter(pbmc, feature1 = "nCount_RNA", feature2 = "nFeature_RNA")
plot1 + plot2


FeatureScatter(pbmc, feature1 = "nCount_RNA", feature2 = "percent.rb")
FeatureScatter(pbmc, feature1 = "percent.rb", feature2 = "percent.mt")

Фильтрация: nFeature_RNA(кол-во генов) больше 200 и меньше 2500(если выше - вероятно это дуплеты), фильтрация по митохондриальному контенту < 15, в соответсвии с предыдущим графиком(VlnPlot)

pbmc <- subset(pbmc, subset = nFeature_RNA > 200 & 
                              nFeature_RNA < 2500 & 
                              percent.mt < 15)
meta <- pbmc@meta.data
dim(meta)
## [1] 7396    5

После фильтрации 7396 клеток

VlnPlot(pbmc, features = c("nFeature_RNA","nCount_RNA","percent.mt","percent.rb"),ncol = 4, pt.size = 0.1) & 
  theme(plot.title = element_text(size= 10))

pbmc <- NormalizeData(pbmc)
pbmc <- FindVariableFeatures(pbmc, selection.method = "vst", nfeatures = 2000)
top10 <- head(VariableFeatures(pbmc), 10)
top10 #топ 10 вариабельных генов
##  [1] "S100A9" "LYZ"    "S100A8" "IGKC"   "PPBP"   "IGLC3"  "IGLC2"  "PF4"   
##  [9] "HBB"    "IGLC1"

Plot variable features with and without labels:

plot1 <- VariableFeaturePlot(pbmc)
LabelPoints(plot = plot1, points = top10, repel = TRUE, xnudge = 0, ynudge = 0)
## Warning: Transformation introduced infinite values in continuous x-axis

all.genes <- rownames(pbmc)
pbmc <- ScaleData(pbmc, features = all.genes)
## Centering and scaling data matrix
pbmc <- RunPCA(pbmc, features = VariableFeatures(object = pbmc))
## PC_ 1 
## Positive:  RPS12, RPS18, IL7R, LTB, RPL10, RPL13, MALAT1, IL32, TRBC2, TPT1 
##     EEF1A1, TRBC1, ARL4C, LEF1, RPLP1, MAL, BCL2, CD69, RORA, SYNE2 
##     FHIT, AQP3, GZMM, CD8B, NELL2, LINC02446, ARID5B, TSHZ2, AP3M2, TBC1D4 
## Negative:  FCN1, CST3, IFI30, CYBB, MNDA, LYZ, FGL2, VCAN, S100A9, MPEG1 
##     SERPINA1, NCF2, S100A8, AC020656.1, MS4A6A, CD14, CTSS, LST1, CLEC7A, CD68 
##     CFD, SPI1, CSTA, TNFAIP2, LGALS2, PSAP, CD302, CPVL, S100A12, GRN 
## PC_ 2 
## Positive:  EEF1A1, MALAT1, RPL13, RPS18, RPL10, RPS12, RPLP1, TPT1, LTB, VIM 
##     IL7R, CD37, JUN, JUNB, HLA-DRA, AIF1, NFKBIZ, HLA-DQB1, S100A6, CD74 
##     MARCH1, SAMHD1, MAL, POU2F2, S100A10, CTSS, HLA-DRB5, NFKBIA, DUSP1, CYBB 
## Negative:  GP1BB, TUBB1, CAVIN2, GNG11, GP9, PF4, PPBP, PRKAR2B, NRGN, ACRBP 
##     PTCRA, CLU, F13A1, CMTM5, SPARC, RGS18, C2orf88, HIST1H2AC, TREML1, AC147651.1 
##     TMEM40, MMD, MPIG6B, ITGA2B, DAB2, RUFY1, MAP3K7CL, PF4V1, TSC22D1, CLEC1B 
## PC_ 3 
## Positive:  CD79A, IGHM, MS4A1, BANK1, IGHD, IGKC, RALGPS2, BCL11A, TNFRSF13C, LINC00926 
##     CD22, CD79B, HLA-DQA1, HLA-DRA, PAX5, LTB, AFF3, TCL1A, COBLL1, FCRL1 
##     FCER2, IGLC2, VPREB3, NIBAN3, MEF2C, BLK, ADAM28, MARCH1, HLA-DQB1, TCF4 
## Negative:  NKG7, GNLY, CST7, GZMA, FGFBP2, GZMB, KLRD1, PRF1, GZMH, CTSW 
##     CCL5, EFHD2, HOPX, ADGRG1, FCGR3A, CCL4, SPON2, KLRF1, TRDC, S100A4 
##     GZMM, S1PR5, IL2RB, ARL4C, ID2, CX3CR1, KLRK1, TYROBP, CLIC3, KLRB1 
## PC_ 4 
## Positive:  MS4A1, IGHM, CD79A, BANK1, IGHD, HLA-DPB1, CD79B, IGKC, NKG7, HLA-DPA1 
##     CD22, BCL11A, GNLY, LINC00926, TNFRSF13C, RALGPS2, GZMB, HLA-DQA1, CST7, PAX5 
##     FGFBP2, COBLL1, GZMA, CD74, KLRD1, HLA-DRA, PRF1, TCL1A, GZMH, ADAM28 
## Negative:  IL7R, TPT1, MAL, LEF1, LTB, VIM, IL6ST, RPS12, RPL13, EEF1A1 
##     FHIT, AIF1, RPLP1, IL32, CD4, RPS18, SAMHD1, AQP3, JUN, NFKBIZ 
##     JAML, TBC1D4, VCAN, TNFAIP3, S100A12, S100A8, NELL2, RPL10, NAP1L1, TOB1 
## PC_ 5 
## Positive:  S100A12, VCAN, S100A8, CD14, AC020656.1, MS4A6A, CSF3R, CD36, AC020916.1, CXCL8 
##     CLEC4E, ALDH1A1, PLBD1, MGST1, NCF1, LGALS2, CREB5, IER3, CYP1B1, RNASE6 
##     ALDH2, FOSB, CD163, CD93, TREM1, IL1B, GAS7, CCL3, S100A9, MNDA 
## Negative:  CDKN1C, HES4, TCF7L2, SMIM25, CSF1R, AC104809.2, MS4A7, LYPD2, HMOX1, FCGR3A 
##     ZNF703, SIGLEC10, CKB, VMO1, IFITM3, PPM1N, MS4A4A, RHOC, LINC02432, MTSS1 
##     FMNL2, IL3RA, MEG3, LRRC25, NEURL1, BATF3, CTSL, MAFB, RRAS, AC064805.1
DimPlot(pbmc, reduction = "pca")

print(pbmc[["pca"]], dims = 1:5, nfeatures = 5) 
## PC_ 1 
## Positive:  RPS12, RPS18, IL7R, LTB, RPL10 
## Negative:  FCN1, CST3, IFI30, CYBB, MNDA 
## PC_ 2 
## Positive:  EEF1A1, MALAT1, RPL13, RPS18, RPL10 
## Negative:  GP1BB, TUBB1, CAVIN2, GNG11, GP9 
## PC_ 3 
## Positive:  CD79A, IGHM, MS4A1, BANK1, IGHD 
## Negative:  NKG7, GNLY, CST7, GZMA, FGFBP2 
## PC_ 4 
## Positive:  MS4A1, IGHM, CD79A, BANK1, IGHD 
## Negative:  IL7R, TPT1, MAL, LEF1, LTB 
## PC_ 5 
## Positive:  S100A12, VCAN, S100A8, CD14, AC020656.1 
## Negative:  CDKN1C, HES4, TCF7L2, SMIM25, CSF1R
VizDimLoadings(pbmc, dims = 1:9, reduction = "pca") & 
  theme(axis.text=element_text(size=5), axis.title=element_text(size=8,face="bold"))

# Heatmaps of these genes
DimHeatmap(pbmc, dims = 1:12, nfeatures = 20, cells = 500, balanced = T)

ElbowPlot показывает какое количество принципиальных компонент объясняет вариабельность данных

ElbowPlot(pbmc) 

pbmc <- FindNeighbors(pbmc, dims = 1:10) #ищем соседей по кластеру
## Computing nearest neighbor graph
## Computing SNN
pbmc <- FindClusters(pbmc, resolution = 0.4) # Resolution may vary ~0.4-1.2, depending on how well (biologically) it describes clusters
## Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck
## 
## Number of nodes: 7396
## Number of edges: 244698
## 
## Running Louvain algorithm...
## Maximum modularity in 10 random starts: 0.9140
## Number of communities: 11
## Elapsed time: 1 seconds
pbmc <- RunUMAP(pbmc, dims = 1:10, verbose = F)
## Warning: The default method for RunUMAP has changed from calling Python UMAP via reticulate to the R-native UWOT using the cosine metric
## To use Python UMAP via reticulate, set umap.method to 'umap-learn' and metric to 'correlation'
## This message will be shown once per session
## Found more than one class "dist" in cache; using the first, from namespace 'spam'
## Also defined by 'BiocGenerics'
## Found more than one class "dist" in cache; using the first, from namespace 'spam'
## Also defined by 'BiocGenerics'
head(Idents(pbmc), 5)
## AAACCCAAGCGTGAGT-1 AAACCCAGTCGACGCT-1 AAACCCATCAAGAGTA-1 AAACCCATCCACGTCT-1 
##                  1                  0                  0                  0 
## AAACCCATCCCTCTAG-1 
##                  1 
## Levels: 0 1 2 3 4 5 6 7 8 9 10

Кластеры и количество клеток в каждом кластере

table(pbmc@meta.data$seurat_clusters)
## 
##    0    1    2    3    4    5    6    7    8    9   10 
## 2347 1561  809  745  498  387  384  333  176  108   48
DimPlot(pbmc,label.size = 4,repel = T,label = T)

FeaturePlot(pbmc, features = c("S100A9", "IGLC1", "IGHM", "PF4"))

# QC check
FeaturePlot(pbmc, features = "percent.mt") & theme(plot.title = element_text(size=10))
FeaturePlot(pbmc, features = "percent.rb") & theme(plot.title = element_text(size=10))
FeaturePlot(pbmc, features = "nFeature_RNA") & theme(plot.title = element_text(size=10))
FeaturePlot(pbmc, features = "nCount_RNA") & theme(plot.title = element_text(size=10))

Гены клеточного цикла:

cc.genes.updated.2019
## $s.genes
##  [1] "MCM5"     "PCNA"     "TYMS"     "FEN1"     "MCM7"     "MCM4"    
##  [7] "RRM1"     "UNG"      "GINS2"    "MCM6"     "CDCA7"    "DTL"     
## [13] "PRIM1"    "UHRF1"    "CENPU"    "HELLS"    "RFC2"     "POLR1B"  
## [19] "NASP"     "RAD51AP1" "GMNN"     "WDR76"    "SLBP"     "CCNE2"   
## [25] "UBR7"     "POLD3"    "MSH2"     "ATAD2"    "RAD51"    "RRM2"    
## [31] "CDC45"    "CDC6"     "EXO1"     "TIPIN"    "DSCC1"    "BLM"     
## [37] "CASP8AP2" "USP1"     "CLSPN"    "POLA1"    "CHAF1B"   "MRPL36"  
## [43] "E2F8"    
## 
## $g2m.genes
##  [1] "HMGB2"   "CDK1"    "NUSAP1"  "UBE2C"   "BIRC5"   "TPX2"    "TOP2A"  
##  [8] "NDC80"   "CKS2"    "NUF2"    "CKS1B"   "MKI67"   "TMPO"    "CENPF"  
## [15] "TACC3"   "PIMREG"  "SMC4"    "CCNB2"   "CKAP2L"  "CKAP2"   "AURKB"  
## [22] "BUB1"    "KIF11"   "ANP32E"  "TUBB4B"  "GTSE1"   "KIF20B"  "HJURP"  
## [29] "CDCA3"   "JPT1"    "CDC20"   "TTK"     "CDC25C"  "KIF2C"   "RANGAP1"
## [36] "NCAPD2"  "DLGAP5"  "CDCA2"   "CDCA8"   "ECT2"    "KIF23"   "HMMR"   
## [43] "AURKA"   "PSRC1"   "ANLN"    "LBR"     "CKAP5"   "CENPE"   "CTCF"   
## [50] "NEK2"    "G2E3"    "GAS2L3"  "CBX5"    "CENPA"
s.genes <- cc.genes.updated.2019$s.genes
g2m.genes <- cc.genes.updated.2019$g2m.genes
pbmc <- CellCycleScoring(pbmc, s.features = s.genes, g2m.features = g2m.genes)
## Warning: The following features are not present in the object: E2F8, not
## searching for symbol synonyms
## Warning: The following features are not present in the object: PIMREG, CDC25C,
## NEK2, not searching for symbol synonyms
table(pbmc[[]]$Phase)
## 
##   G1  G2M    S 
## 2856 1867 2673
DimPlot(pbmc, group.by = "Phase") 
DimPlot(pbmc, split.by = "Phase")

FeaturePlot(pbmc,features = c("S.Score","G2M.Score"),label.size = 4,repel = T,label = T) & 
  theme(plot.title = element_text(size=10))

VlnPlot(pbmc,features = c("S.Score","G2M.Score")) & 
  theme(plot.title = element_text(size=10)) 

ПОИСК ГЕНОВ-МАРКЕРОВ ДЛЯ КЛАСТЕРОВ

cluster2.markers <- FindMarkers(pbmc, ident.1 = 2, min.pct = 0.25)
head(cluster2.markers, n = 5)

Второй кластер вероятно принадлежит T-лимфоцитам

Топ 2 гена-маркера для каждого из кластеров:

pbmc.markers <- FindAllMarkers(pbmc, only.pos = TRUE, min.pct = 0.25, logfc.threshold = 0.25)
## Calculating cluster 0
## Calculating cluster 1
## Calculating cluster 2
## Calculating cluster 3
## Calculating cluster 4
## Calculating cluster 5
## Calculating cluster 6
## Calculating cluster 7
## Calculating cluster 8
## Calculating cluster 9
## Calculating cluster 10
pbmc.markers %>%
  group_by(cluster) %>%
  slice_max(n = 2, order_by = avg_log2FC)

Кластер 0

FeaturePlot(pbmc, features = c("IL6ST", "FHIT")) #cluster 0 - Endothelial cells 
VlnPlot(pbmc, features = c("IL6ST", "FHIT"))

Кластер 1

FeaturePlot(pbmc, features = c("TSHZ2", "BRWD1")) #cluster 1 - Neurons ???
VlnPlot(pbmc, features = c("TSHZ2", "BRWD1"))

Кластеры 2, 3, 5, 9

FeaturePlot(pbmc, features = c("LINC02446", "CD8B")) #cluster 2 - T-cells
FeaturePlot(pbmc, features = c("GZMH",  "TRDC")) #cluster 3 - Gamma delta T + NK cells
FeaturePlot(pbmc, features = c("IGKC", "IGHM")) #cluster 5 - B-cells
FeaturePlot(pbmc, features = c("CDKN1C", "LYPD2")) #cluster 9 - Fibroblasts + EC

pbmc.markers %>%
  group_by(cluster) %>%
  top_n(n = 10, wt = avg_log2FC) -> top10
DoHeatmap(pbmc, features = top10$gene) + NoLegend()

new.cluster.ids <- c("Endothelial cells", "Neurons", "T-cells", "Gamma delta T + NK cells", "Cluster 4", "B-cells", "Cluster 6", "Cluster 7", "Cluster 8", "Fibroblasts + EC", "Cluster 10")
names(new.cluster.ids) <- levels(pbmc)
pbmc <- RenameIdents(pbmc, new.cluster.ids)
DimPlot(pbmc, reduction = "umap", label = TRUE, pt.size = 0.5) + NoLegend()

Cell type annotation using SingleR

monaco.ref <- celldex::MonacoImmuneData()
## snapshotDate(): 2022-10-31
## see ?celldex and browseVignettes('celldex') for documentation
## loading from cache
## see ?celldex and browseVignettes('celldex') for documentation
## loading from cache
sce <- as.SingleCellExperiment(DietSeurat(pbmc))
## Warning: `invoke()` is deprecated as of rlang 0.4.0.
## Please use `exec()` or `inject()` instead.
## This warning is displayed once every 8 hours.
monaco.main <- SingleR(test = sce,assay.type.test = 1,ref = monaco.ref,labels = monaco.ref$label.main)
monaco.fine <- SingleR(test = sce,assay.type.test = 1,ref = monaco.ref,labels = monaco.ref$label.fine)
table(monaco.main$pruned.labels)
## 
##         B cells       Basophils    CD4+ T cells    CD8+ T cells Dendritic cells 
##             357               5            3431            1253              16 
##       Monocytes     Neutrophils        NK cells     Progenitors         T cells 
##             581              35             597              66             982
pbmc@meta.data$monaco.main <- monaco.main$pruned.labels
pbmc@meta.data$monaco.fine <- monaco.fine$pruned.labels
pbmc <- SetIdent(pbmc, value = "monaco.main")
DimPlot(pbmc, label = T , repel = T, label.size = 3) + NoLegend()